Read in files
sortedclin<-readRDS(file="hnsc_clin_sorted.RDS")
sortedcpg<-readRDS(file="hnsc_cpg1000_sorted.RDS")
dim(sortedclin)
## [1] 528 79
dim(sortedcpg)
## [1] 528 1000
Packages
library(dplyr)
library(tidyr)
library(Rtsne)
library(pals)
Dimension reduction of cpgs using TSNE:
set.seed(123)
cpgtsne<-Rtsne(sortedcpg, perplexity=15, exaggeration_factor=5)$Y
par(pty="s")
plot(cpgtsne, xlab= "TSNE dim 1", ylab="TSNE dim 2")
From the above, it appears there are about 5 clusters.
library(umap)
## Warning: package 'umap' was built under R version 4.0.5
umap.defaults
## umap configuration parameters
## n_neighbors: 15
## n_components: 2
## metric: euclidean
## n_epochs: 200
## input: data
## init: spectral
## min_dist: 0.1
## set_op_mix_ratio: 1
## local_connectivity: 1
## bandwidth: 1
## alpha: 1
## gamma: 1
## negative_sample_rate: 5
## a: NA
## b: NA
## spread: 1
## random_state: NA
## transform_state: NA
## knn: NA
## knn_repeats: 1
## verbose: FALSE
## umap_learn_args: NA
myumapsetting<-umap.defaults
myumapsetting$spread<-4 #used during automatic estimation of a/b parameters
myumapsetting$min_dist<-0.5 #must be smaller than "spread"; how close points appear in final layout
myumap<-umap(sortedcpg, myumapsetting)
par(pty="s")
plot(myumap$layout)
Do any of these overlay with known factors, especially nuisance factors? We can extract some factors of interest from the clinical table to overlay on top of TSNE. It makes sense to choose columns that has less than 10 levels.
First, drop any column that has ALL NA values:
isna<-apply(sortedclin, 2, function(x) sum(is.na(x))==nrow(sortedclin))
table(isna)
## isna
## FALSE TRUE
## 74 5
sortedclin<-sortedclin[,-which(isna)]
numlevels<-apply(sortedclin, 2, function(x) length(unique(x)))
# sort(numlevels)
names(numlevels)[which(numlevels<=10)]
## [1] "patient.histologic_diagnosis"
## [2] "patient.laterality"
## [3] "patient.prospective_collection"
## [4] "patient.retrospective_collection"
## [5] "patient.gender"
## [6] "patient.race"
## [7] "patient.ethnicity"
## [8] "patient.history_other_malignancy"
## [9] "patient.history_neoadjuvant_treatment"
## [10] "patient.lymph_node_neck_dissection_indicator"
## [11] "patient.lymph_node_dissection_method...17"
## [12] "patient.lymph_node_dissection_method...18"
## [13] "patient.lymph_nodes_examined"
## [14] "patient.lymph_nodes_examined_ihc_count"
## [15] "patient.margin_status"
## [16] "patient.egfr_amplification_status"
## [17] "patient.vital_status"
## [18] "patient.tumor_status"
## [19] "patient.ajcc_staging_edition"
## [20] "patient.ajcc_tumor_pathologic_pt"
## [21] "patient.ajcc_nodes_pathologic_pn"
## [22] "patient.ajcc_metastasis_pathologic_pm"
## [23] "patient.ajcc_pathologic_tumor_stage"
## [24] "patient.extracapsular_spread_pathologic"
## [25] "patient.tumor_grade"
## [26] "patient.lymphovascular_invasion"
## [27] "patient.perineural_invasion"
## [28] "patient.hpv_status_p16"
## [29] "patient.hpv_status_ish"
## [30] "patient.tobacco_smoking_history_indicator"
## [31] "patient.alcohol_history_documented"
## [32] "patient.alcohol_consumption_frequency"
## [33] "patient.radiation_treatment_adjuvant"
## [34] "patient.pharmaceutical_tx_adjuvant"
## [35] "patient.treatment_outcome_first_course"
## [36] "patient.new_tumor_event_dx_indicator"
## [37] "patient.clinical_M"
## [38] "patient.clinical_N"
## [39] "patient.clinical_T"
## [40] "patient.clinical_stage"
## [41] "patient.days_to_initial_pathologic_diagnosis"
## [42] "patient.icd_o_3_histology"
## [43] "patient.informed_consent_verified"
## [44] "patient.tumor_tissue_site"
## [45] "fu48.vital_status"
## [46] "fu48.tumor_status"
## [47] "fu48.treatment_outcome_first_course"
## [48] "fu48.tobacco_smokeless_use_at_dx"
## [49] "rad.treatment_best_response"
## [50] "rad.radiation_therapy_site"
## [51] "chemoyes"
interesting<-names(numlevels)[which(numlevels<=10)]
subsetclin<-sortedclin[,interesting]
# pal.bands(glasbey(), show.names=TRUE)
mycolorpal<-glasbey(12)
for (i in 1:length(interesting)){
plot(cpgtsne, col=as.factor(subsetclin[,i]), main=interesting[i])
legend('topright', legend = levels(as.factor(subsetclin[,i])), col = 1:3, cex = 0.8, pch = 1)
}
patient.gender, patient.hpv_status_p16, and patient.hpv_status_ish associate with the visual “clusters” we see with our 2-dimensional TSNE.
library(Mercator)
## Loading required package: Thresher
## Loading required package: ClassDiscovery
## Loading required package: cluster
## Loading required package: oompaBase
## Loading required package: PCDimension
library(clValid)
## Warning: package 'clValid' was built under R version 4.0.5
clValid can compare internal validation metrics between Kmeans, hierarchical, and pam clustering for different values of K.
myclmethods<-c("hierarchical", "kmeans", "pam")
clvalidres<-clValid(sortedcpg, nClust=2:10, clMethods = myclmethods, validation="internal")
clvalidres@measures
## , , hierarchical
##
## 2 3 4 5 6
## Connectivity 8.7869048 14.6448413 14.64484127 56.11071429 56.11071429
## Dunn 0.5333043 0.5333043 0.53330425 0.45304181 0.45304181
## Silhouette 0.1639850 0.1249195 0.09750493 0.08811865 0.08418329
## 7 8 9 10
## Connectivity 56.11071429 56.11071429 73.68452381 76.61349206
## Dunn 0.45304181 0.45304181 0.45304181 0.45304181
## Silhouette 0.07594937 0.06454131 0.05928766 0.03623698
##
## , , kmeans
##
## 2 3 4 5 6
## Connectivity 178.1619048 240.79603175 247.22222222 322.30714286 417.40595238
## Dunn 0.3362252 0.37186285 0.36793614 0.36932335 0.34273851
## Silhouette 0.1000159 0.08897024 0.09247967 0.06231365 0.05457374
## 7 8 9 10
## Connectivity 438.82142857 444.11309524 461.75119048 538.12023810
## Dunn 0.41149264 0.38576992 0.40702368 0.37643895
## Silhouette 0.05559165 0.04462462 0.05092247 0.03689375
##
## , , pam
##
## 2 3 4 5 6
## Connectivity 186.26230159 260.28492063 392.40952381 393.73611111 432.48134921
## Dunn 0.31550735 0.35939076 0.33268764 0.35536614 0.34687141
## Silhouette 0.09474668 0.08354821 0.04424713 0.04944241 0.04574817
## 7 8 9 10
## Connectivity 415.60158730 461.37380952 502.34166667 536.68055556
## Dunn 0.35597820 0.35723111 0.37809569 0.37809569
## Silhouette 0.04371763 0.03718951 0.03237722 0.02776716
plot(clvalidres)
summary(clvalidres)
##
## Clustering Methods:
## hierarchical kmeans pam
##
## Cluster sizes:
## 2 3 4 5 6 7 8 9 10
##
## Validation Measures:
## 2 3 4 5 6 7 8 9 10
##
## hierarchical Connectivity 8.7869 14.6448 14.6448 56.1107 56.1107 56.1107 56.1107 73.6845 76.6135
## Dunn 0.5333 0.5333 0.5333 0.4530 0.4530 0.4530 0.4530 0.4530 0.4530
## Silhouette 0.1640 0.1249 0.0975 0.0881 0.0842 0.0759 0.0645 0.0593 0.0362
## kmeans Connectivity 178.1619 240.7960 247.2222 322.3071 417.4060 438.8214 444.1131 461.7512 538.1202
## Dunn 0.3362 0.3719 0.3679 0.3693 0.3427 0.4115 0.3858 0.4070 0.3764
## Silhouette 0.1000 0.0890 0.0925 0.0623 0.0546 0.0556 0.0446 0.0509 0.0369
## pam Connectivity 186.2623 260.2849 392.4095 393.7361 432.4813 415.6016 461.3738 502.3417 536.6806
## Dunn 0.3155 0.3594 0.3327 0.3554 0.3469 0.3560 0.3572 0.3781 0.3781
## Silhouette 0.0947 0.0835 0.0442 0.0494 0.0457 0.0437 0.0372 0.0324 0.0278
##
## Optimal Scores:
##
## Score Method Clusters
## Connectivity 8.7869 hierarchical 2
## Dunn 0.5333 hierarchical 2
## Silhouette 0.1640 hierarchical 2
Connectivity should be minimized; Dunn index should be maximized; Silhouette width should be maximized.
myK<-seq(from=2, to=7, by=1)
res_kmeans2<-kmeans(x=as.matrix(sortedcpg), centers=2, iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong"), trace=FALSE)
mykmeansclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
mykmeansres = kmeans(x=as.matrix(sortedcpg), centers=myK[i],
iter.max = 10, nstart = 1,
algorithm = c("Hartigan-Wong"), trace=FALSE)
mykmeansclusters [,i] <-mykmeansres$cluster
}
colnames(mykmeansclusters)<-paste("Kmeans",myK, sep="_")
res_pam2<-pam(x=LFt, k=2, metric = “euclidean”, stand = FALSE)
mypamclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
mypamres = pam(x=as.matrix(sortedcpg), k=myK[i],
metric = "euclidean", stand = FALSE)
mypamclusters [,i] <-mypamres$clustering
}
colnames(mypamclusters)<-paste("PAM",myK, sep="_")
library(kernlab)
myspeccclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
myspeccres<-specc(as.matrix(sortedcpg), centers=myK[i])
myspeccclusters [,i] <-myspeccres@.Data
}
colnames(myspeccclusters)<-paste("specc",myK, sep="_")
library(Mercator)
my.binmat <- BinaryMatrix(as.matrix(t(sortedcpg)))
myjaccclusters <-matrix(NA, ncol=length(myK), nrow=nrow(sortedcpg))
for (i in 1:length(myK)){
jacc.Vis <- Mercator(my.binmat, "jaccard", "hclust", K=myK[i])
myjaccclusters [,i] <-jacc.Vis@clusters
}
colnames(myjaccclusters)<-paste("jacc",myK, sep="_")
library(mclust)
## Package 'mclust' version 5.4.7
## Type 'citation("mclust")' for citing this R package in publications.
# compute BIC for all covariance structures and up to 9 components
# Gives a glimpse as to which model-component pairs are best for our data according to BIC
myG <- myK
mybic<-mclustBIC(data=sortedcpg, G=myG)
mybic
## Bayesian Information Criterion (BIC):
## EII VII EEI VEI EVI VVI
## 2 -91604.96 -91055.72 -83412.64 -82811.572 -80363.058 -78692.759
## 3 -75078.23 -65829.68 -65682.26 -52879.533 -59961.536 -52861.351
## 4 -53979.90 -49013.18 -41976.95 -35383.024 -32933.871 -30840.522
## 5 -45422.72 -36711.45 -32288.82 -21022.994 -12148.528 -10992.574
## 6 -39823.91 -29648.30 -25669.58 -11431.538 -5798.975 -577.452
## 7 -36439.09 -25801.71 -21339.10 -7036.054 -4851.289 7741.785
##
## Top 3 models based on the BIC criterion:
## VVI,7 VVI,6 EVI,7
## 7741.785 -577.452 -4851.289
plot(mybic)
mymodelNames<-c("EII", "VII", "EEI", "VEI", "EVI", "VVI")
matBIC<-as.data.frame(matrix(mybic, ncol=length(mymodelNames), nrow=length(myG)))
colnames(matBIC)<-mymodelNames
rownames(matBIC)<-myG
top<-3
topBIC<-head(sort(mybic, decreasing=TRUE),top)
myrow<-rep(NA, top)
mycol<-rep(NA, top)
for(i in 1:length(topBIC)){
w<-which(topBIC[i]==matBIC, arr.ind = TRUE)
print(w)
myrow[i]<-rownames(matBIC)[w[1,1]]
mycol[i]<-colnames(matBIC)[w[1,2]]
}
## row col
## 7 6 6
## row col
## 6 5 6
## row col
## 7 6 5
mymclusters<-matrix(NA, nrow=nrow(sortedcpg), ncol=top)
for (i in 1:top){
resmclust<-Mclust(data=sortedcpg, G=myrow[i], modelNames = mycol[i])
mymclusters[,i]<-resmclust$classification
}
colnames(mymclusters)<-paste("mclust", mycol, myrow, sep="_")
head(mymclusters)
## mclust_VVI_7 mclust_VVI_6 mclust_EVI_7
## [1,] 1 1 1
## [2,] 7 2 2
## [3,] 7 3 5
## [4,] 3 1 3
## [5,] 4 4 4
## [6,] 1 1 1
# what if mclust with 2 groups?
resmclust2<-Mclust(data=sortedcpg, G=5, modelNames = "VVI")
par(pty="s")
plot(x=cpgtsne[,1], y=cpgtsne[,2],
bg=alphabet()[resmclust2$classification], pch=21, cex=1.2,
xlab="t-SNE dimension 1",ylab="t-SNE dimension 2")
clustersbig<-cbind(mykmeansclusters, mypamclusters, myspeccclusters,
mymclusters,
myjaccclusters) #just plotting 1
rownames(mykmeansclusters)<-rownames(sortedcpg)
par(pty="s")
for (i in 1:ncol(clustersbig)){
plot(x=cpgtsne[,1], y=cpgtsne[,2],
bg=glasbey()[clustersbig[,i]], pch=21, cex=1.2,
xlab="t-SNE dimension 1",ylab="t-SNE dimension 2",
main=colnames(clustersbig)[i])
}
par(pty="s")
for (i in 1:ncol(clustersbig)){
plot(x=myumap$layout[,1], y=myumap$layout[,2],
bg=glasbey()[clustersbig[,i]], pch=21, cex=1.2,
xlab="UMAP dimension 1",ylab="UMAP dimension 2",
main=colnames(clustersbig)[i])
}